As per RFC1738 (http://www.w3.org/Addressing/rfc1738.txt, [Page 3]),
the characters ";", "/", "?", ":", "@", "=" and "&" should not be encoded
or decoded by wget, because they have a special meaning. Encoding or
decoding these characters may change the semantics of a URL.
For example, "http://abc.xyz/abc?def" and "http://abc.xyz/abc%3Fdef" are
not the same location. But when invoking the command
$ wget 'http://abc.xyz/abc%3Fdef'
wget changes "%3F" to "?", which leads to the wrong location.
Also, the "+" character (not listed in the RFC) should retain its
encoded/decoded status, because it has a different meaning when it
appears in a CGI query. For example,
"http://abc.xyz/abc.cgi?var1=a+b" means that var1 = "a b"
"http://abc.xyz/abc.cgi?var1=a%2Bb" means that var1 = "a+b"
The following is a patch for this bug to wget 1.5.3.
diff -ur wget-1.5.3.orig/src/url.c wget-1.5.3/src/url.c
--- wget-1.5.3.orig/src/url.c Fri Sep 11 07:23:26 1998
+++ wget-1.5.3/src/url.c Mon Oct 23 19:49:04 2000
@@ -51,6 +51,12 @@
/* URL separator (for findurl) */
#define URL_SEPARATOR "!\"#'(),>`{}|<>"
+/* A list of characters reserved for special meaning, as per RFC1738.
+ Encoding or decoding these characters may change the semantics of a URL.
+ '+' was added because "+" and "%2B" have the different meaning when they
+ appear in a cgi query. */
+#define URL_RESERVED ";/?:@=&" "+"
+
/* A list of unsafe characters for encoding, as per RFC1738. '@' and
':' (not listed in RFC) were added because of user/password
encoding, and \033 for safe printing. */
@@ -73,6 +79,16 @@
} \
} while (0)
+#define URL_CLEANSE2(s, url_unsafe) do /* Replace S in place with encode_string2 (S, URL_UNSAFE). */ \
+{ \
+ if (1) /* Constant condition: the string is always re-encoded. */ \
+ { \
+ char *uc_tmp = encode_string2 (s, url_unsafe);\
+ free (s); /* The old string is released, so S must be a pointer that free () accepts. */ \
+ (s) = uc_tmp; /* S is assigned to, so it must be an lvalue. */ \
+ } \
+} while (0)
+
/* Is a directory "."? */
#define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
/* Is a directory ".."? */
@@ -184,7 +200,7 @@
literally. */
static void
-decode_string (char *s)
+decode_string (char *s, const char *url_reserved)
{
char *p = s;
@@ -203,6 +219,13 @@
continue;
}
*p = (ASC2HEXD (*(s + 1)) << 4) + ASC2HEXD (*(s + 2));
+ if (strchr(url_reserved, *p))
+ {
+ *p = '%';
+ *(s + 1) = toupper(*(s + 1));
+ *(s + 2) = toupper(*(s + 2));
+ continue;
+ }
s += 2;
}
}
@@ -237,6 +260,46 @@
*p = '\0';
return res;
}
+
+char * /* Returns a new string obtained from xmalloc (); S is not modified. */
+encode_string2 (const char *s, const char *url_unsafe) /* Write each character of S that occurs in URL_UNSAFE as a percent sign plus two hex digits. */
+{
+ const char *b; /* Start of S, saved so the string can be scanned twice. */
+ char *p, *res; /* RES is the result buffer, P the write cursor into it. */
+ int i; /* Length of the encoded result, not counting the terminating NUL. */
+
+ b = s;
+ for (i = 0; *s; s++, i++) /* Pass 1: compute the length of the result. */
+ {
+ if (*s == '%' && *(s + 1) && *(s + 2)
+ && (ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
+ continue; /* A percent sign followed by two hex digits is counted as one plain character. */
+ if (strchr (url_unsafe, *s))
+ i += 2; /* Two more characters (hex digits) */
+ }
+ res = (char *)xmalloc (i + 1); /* One extra byte for the terminating NUL. */
+ s = b; /* Rewind to the start of the input for the second pass. */
+ for (p = res; *s; s++) /* Pass 2: produce the output, mirroring the tests of pass 1. */
+ {
+ if (*s == '%' && *(s + 1) && *(s + 2)
+ && (ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
+ {
+ *p++ = *s; /* Copy the percent sign unchanged, even if it is listed in URL_UNSAFE. */
+ continue;
+ }
+ if (strchr (url_unsafe, *s))
+ {
+ const unsigned char c = *s; /* Unsigned, so the shift and mask below yield values 0 to 15. */
+ *p++ = '%';
+ *p++ = HEXD2ASC (c >> 4); /* High nibble. */
+ *p++ = HEXD2ASC (c & 0xf); /* Low nibble. */
+ }
+ else
+ *p++ = *s; /* Every other character is copied verbatim. */
+ }
+ *p = '\0';
+ return res;
+}
/* Returns the proto-type if URL's protocol is supported, or
URLUNKNOWN if not. */
@@ -471,12 +534,22 @@
/* Parse the username and password (if existing). */
parse_uname (url, &u->user, &u->passwd);
/* Decode the strings, as per RFC 1738. */
- decode_string (u->host);
- decode_string (u->path);
+ decode_string (u->host, "");
+ // To prevent the case that, "%%32%36" => "%26" (that char(26) == '&') //
+ // that it should be, "%%32%36" => "%2526" //
+ // So, quote the unsafe "%" //
+ {
+ char* path_temp = xstrdup (u->path);
+ URL_CLEANSE2 (path_temp, "%");
+ free (u->path); u->path = (char *)xmalloc (strlen (path_temp) + 8);
+ strcpy (u->path, path_temp);
+ free (path_temp);
+ };
+ decode_string (u->path, URL_RESERVED "%" URL_UNSAFE);
if (u->user)
- decode_string (u->user);
+ decode_string (u->user, "");
if (u->passwd)
- decode_string (u->passwd);
+ decode_string (u->passwd, "");
/* Parse the directory. */
parse_dir (u->path, &u->dir, &u->file);
DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
@@ -498,7 +571,7 @@
strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
strcat (u->path, *u->dir ? "/" : "");
strcat (u->path, u->file);
- URL_CLEANSE (u->path);
+ URL_CLEANSE2 (u->path, URL_UNSAFE);
/* Create the clean URL. */
u->url = str_url (u, 0);
return URLOK;
@@ -618,6 +691,8 @@
return '\0';
}
+#define CLEANDUP2(x, url_unsafe) (1 ? encode_string2 (x, url_unsafe) /* Always yields the encode_string2 () result for X. */ \
+ : xstrdup (x)) /* Never evaluated, because the condition is the constant 1. */
/* Return the URL as fine-formed string, with a proper protocol, port
number, directory and optional user/password. If HIDE is non-zero,
password will be hidden. The forbidden characters in the URL will
@@ -636,8 +711,8 @@
return NULL;
proto_name = sup_protos[i].name;
host = CLEANDUP (u->host);
- dir = CLEANDUP (u->dir);
- file = CLEANDUP (u->file);
+ dir = CLEANDUP2 (u->dir, URL_UNSAFE);
+ file = CLEANDUP2 (u->file, URL_UNSAFE);
user = passwd = NULL;
if (u->user)
user = CLEANDUP (u->user);
@@ -1167,15 +1242,23 @@
}
free (host);
+ dir = xstrdup (dir);
+ URL_CLEANSE2 (dir, URL_UNSAFE);
/* If there is a prefix, prepend it. */
if (*dirpref)
{
- char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
+// char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
+ char *newdir;
+ dirpref = xstrdup(dirpref);
+ URL_CLEANSE (dirpref);
+ newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
+ free(dir); free(dirpref);
dir = newdir;
+ dir = xstrdup (dir);
}
- dir = xstrdup (dir);
- URL_CLEANSE (dir);
+// dir = xstrdup (dir);
+// URL_CLEANSE (dir);
l = strlen (dir);
if (l && dir[l - 1] == '/')
dir[l - 1] = '\0';